# ! pip install feature_engine
# ! pip install pandas==2.0.3
# !pip install pmdarima
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
from prophet import Prophet
warnings.filterwarnings('ignore')
%matplotlib inline
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
1) items.csv: This data contains information about items.
| Variables | Description |
|---|---|
| id | Unique identification of the item |
| store_id | Unique identification of the store |
| name | Name of the item |
| kcal | A measure of energy nutrients (calories) in the item |
| cost | The unit price of the item bought |
# Load the items master data (one row per menu item)
items_df = pd.read_csv('/content/drive/MyDrive/SalesForecasting_Datasets/items.csv')
items_df.head()
| id | store_id | name | kcal | cost | |
|---|---|---|---|---|---|
| 0 | 1 | 4 | Chocolate Cake | 554 | 6.71 |
| 1 | 2 | 4 | Breaded Fish with Vegetables Meal | 772 | 15.09 |
| 2 | 3 | 1 | Sweet Fruity Cake | 931 | 29.22 |
| 3 | 4 | 1 | Amazing Steak Dinner with Rolls | 763 | 26.42 |
| 4 | 5 | 5 | Milk Cake | 583 | 6.07 |
2) restaurants.csv: This data provides information about the restaurants or store.
| Variables | Description |
|---|---|
| id | Unique identification of the restaurant or store |
| name | Name of the restaurant |
# Load the restaurants master data (note: the source file name is spelled 'resturants.csv')
restaurants_df = pd.read_csv('/content/drive/MyDrive/SalesForecasting_Datasets/resturants.csv')
restaurants_df.head()
| id | name | |
|---|---|---|
| 0 | 1 | Bob's Diner |
| 1 | 2 | Beachfront Bar |
| 2 | 3 | Sweet Shack |
| 3 | 4 | Fou Cher |
| 4 | 5 | Corner Cafe |
3) sales.csv: This contains the count of a particular item sold at a particular store or restaurant for different dates.
| Variables | Description |
|---|---|
| date | Date of purchase |
| item_id | Identifier of the item bought |
| price | Unit price of the item |
| item_count | Total count of the items bought on that day |
# Load the daily sales transactions; parse 'date' into datetime64 on read
sales_df = pd.read_csv('/content/drive/MyDrive/SalesForecasting_Datasets/sales.csv', parse_dates=['date'])
sales_df.head()
| date | item_id | price | item_count | |
|---|---|---|---|---|
| 0 | 2019-01-01 | 3 | 29.22 | 2.0 |
| 1 | 2019-01-01 | 4 | 26.42 | 22.0 |
| 2 | 2019-01-01 | 12 | 4.87 | 7.0 |
| 3 | 2019-01-01 | 13 | 4.18 | 12.0 |
| 4 | 2019-01-01 | 16 | 3.21 | 136.0 |
1. Items
Check data shape and size
items_df.info(memory_usage='deep')  # dtypes, non-null counts and deep memory footprint
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 100 non-null int64 1 store_id 100 non-null int64 2 name 100 non-null object 3 kcal 100 non-null int64 4 cost 100 non-null float64 dtypes: float64(1), int64(3), object(1) memory usage: 11.1 KB
Observations:
Numerical = 4: 'id', 'store_id', 'kcal', 'cost'
Categorical = 1: 'name'
Check for missing values
items_df.isna().mean()  # fraction of missing values per column
id 0.0 store_id 0.0 name 0.0 kcal 0.0 cost 0.0 dtype: float64
Observations:
There are no null/ missing values in this dataset
Check for outliers
# Distribution (left) and horizontal box plot (right) for each numeric column in items_df.
item_num_cols = {'id': 'Item ID', 'store_id': 'Store ID', 'kcal': 'Kcal', 'cost': 'Cost'}
fig, axes = plt.subplots(nrows=len(item_num_cols), ncols=2, figsize=(12, 8))
plt.subplots_adjust(hspace=0.7)  # extra vertical room between plot rows
for row, (col, label) in enumerate(item_num_cols.items()):
    sns.distplot(items_df[col], ax=axes[row, 0])
    items_df.boxplot(column=col, ax=axes[row, 1], grid=False, vert=False)
    axes[row, 0].set_title(label, loc='left', fontweight='bold', color='#333333', pad=20)
    axes[row, 1].set_yticks([])
sns.despine()
Observations:
Check distributions
items_df.describe()  # summary stats for the numeric columns
| id | store_id | kcal | cost | |
|---|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
| mean | 50.500000 | 3.520000 | 536.730000 | 11.763700 |
| std | 29.011492 | 1.708446 | 202.212852 | 8.991254 |
| min | 1.000000 | 1.000000 | 78.000000 | 1.390000 |
| 25% | 25.750000 | 2.000000 | 406.250000 | 5.280000 |
| 50% | 50.500000 | 4.000000 | 572.500000 | 7.625000 |
| 75% | 75.250000 | 5.000000 | 638.250000 | 18.790000 |
| max | 100.000000 | 6.000000 | 1023.000000 | 53.980000 |
Observations:
Scaling is required.
items_df.describe(include='object')  # summary for the categorical 'name' column
| name | |
|---|---|
| count | 100 |
| unique | 94 |
| top | Milky Cake |
| freq | 3 |
Observations:
2. Restaurants
Check data shape and size
restaurants_df.info(memory_usage = 'deep')  # dtypes, non-null counts and memory usage
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6 entries, 0 to 5 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 6 non-null int64 1 name 6 non-null object dtypes: int64(1), object(1) memory usage: 581.0 bytes
Observations:
Numerical 1: 'id'
Categorical 1: 'name'
Check for missing values
restaurants_df.isna().mean()  # fraction of missing values per column
id 0.0 name 0.0 dtype: float64
Observations:
There are no missing values in this dataset.
Check for outliers
# Only one numeric column ('id') in restaurants_df: histogram + box plot side by side.
# (The original loop over a one-element column list is unrolled here.)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
plt.subplots_adjust(hspace=0.7)
sns.distplot(restaurants_df['id'], ax=axes[0])
restaurants_df.boxplot(column='id', grid=False, vert=False, ax=axes[1])
axes[0].set_title('Restaurant Id', loc='left', fontweight='bold', color='#333333', pad=20)
axes[1].set_yticks([])
sns.despine()
Observations:
No outliers detected
Check distributions
restaurants_df.describe()  # summary stats for the numeric 'id' column
| id | |
|---|---|
| count | 6.000000 |
| mean | 3.500000 |
| std | 1.870829 |
| min | 1.000000 |
| 25% | 2.250000 |
| 50% | 3.500000 |
| 75% | 4.750000 |
| max | 6.000000 |
Observations:
'id': This is an identity column having normal distribution (mean = median).
Scale ranges from 1 - 6
restaurants_df.describe(include='object')  # summary for the 'name' column
| name | |
|---|---|
| count | 6 |
| unique | 6 |
| top | Bob's Diner |
| freq | 1 |
Observations:
There are 6 unique restaurants
3. Sales
Check data shape and size
sales_df.info(memory_usage='deep')  # dtypes, non-null counts and memory usage
<class 'pandas.core.frame.DataFrame'> RangeIndex: 109600 entries, 0 to 109599 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 109600 non-null datetime64[ns] 1 item_id 109600 non-null int64 2 price 109600 non-null float64 3 item_count 109600 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(1) memory usage: 3.3 MB
Observations:
Numerical = 3: 'item_id', 'price', 'item_count'
Date = 1: 'date'
Check for missing values
sales_df.isna().mean()  # fraction of missing values per column
date 0.0 item_id 0.0 price 0.0 item_count 0.0 dtype: float64
Observations:
There are no missing values in this dataset.
Check for outliers
# Distribution (left) and horizontal box plot (right) for each numeric sales column.
sales_num_cols = {'item_id': 'Item Id', 'price': 'Price', 'item_count': 'Item count'}
fig, axes = plt.subplots(nrows=len(sales_num_cols), ncols=2, figsize=(12, 8))
plt.subplots_adjust(hspace=0.7)
for row, (col, label) in enumerate(sales_num_cols.items()):
    sns.distplot(sales_df[col], ax=axes[row, 0])
    sales_df.boxplot(column=col, ax=axes[row, 1], grid=False, vert=False)
    axes[row, 0].set_title(label, loc='left', fontweight='bold', color='#333333', pad=20)
    axes[row, 1].set_yticks([])
sns.despine()
Observations:
'price' and 'item_count' contain outliers that need treatment.
Check distributions
sales_df.describe()  # summary stats (includes the datetime 'date' column)
| date | item_id | price | item_count | |
|---|---|---|---|---|
| count | 109600 | 109600.000000 | 109600.000000 | 109600.000000 |
| mean | 2020-07-01 12:00:00 | 50.500000 | 11.763700 | 6.339297 |
| min | 2019-01-01 00:00:00 | 1.000000 | 1.390000 | 0.000000 |
| 25% | 2019-10-01 18:00:00 | 25.750000 | 5.280000 | 0.000000 |
| 50% | 2020-07-01 12:00:00 | 50.500000 | 7.625000 | 0.000000 |
| 75% | 2021-04-01 06:00:00 | 75.250000 | 18.790000 | 0.000000 |
| max | 2021-12-31 00:00:00 | 100.000000 | 53.980000 | 570.000000 |
| std | NaN | 28.866202 | 8.946225 | 30.003728 |
Observations:
Scaling is required
# Join sales with item attributes (sales.item_id -> items.id), then with store names
sales_item = sales_df.merge(items_df, how='left', left_on = 'item_id', right_on = 'id' )
# suffixes apply to the clashing 'id'/'name' columns: '_sales' = left side (sales+items), '_rest' = restaurants
sales_merged = sales_item.merge(restaurants_df, how='left', left_on = 'store_id', right_on = 'id', suffixes = ('_sales', '_rest') )
# Drop unwanted columns (both join-key 'id's and item 'cost'; transaction 'price' is kept)
sales_merged.drop(columns = ['id_sales', 'id_rest','cost'], inplace = True)
# calculate sales revenue = units sold * unit price
sales_merged['sales'] = sales_merged['item_count'] * sales_merged['price']
# Rename columns
sales_merged.rename(columns = {'name_sales':'item_name', 'name_rest':'store_name'}, inplace=True)
# Rearrange columns
sales_merged_with_outliers = sales_merged[['date', 'item_id', 'price', 'item_count', 'item_name', 'kcal', 'store_id', 'store_name', 'sales']]
sales_merged_with_outliers.head()
| date | item_id | price | item_count | item_name | kcal | store_id | store_name | sales | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 | 3 | 29.22 | 2.0 | Sweet Fruity Cake | 931 | 1 | Bob's Diner | 58.44 |
| 1 | 2019-01-01 | 4 | 26.42 | 22.0 | Amazing Steak Dinner with Rolls | 763 | 1 | Bob's Diner | 581.24 |
| 2 | 2019-01-01 | 12 | 4.87 | 7.0 | Fantastic Sweet Cola | 478 | 1 | Bob's Diner | 34.09 |
| 3 | 2019-01-01 | 13 | 4.18 | 12.0 | Sweet Frozen Soft Drink | 490 | 1 | Bob's Diner | 50.16 |
| 4 | 2019-01-01 | 16 | 3.21 | 136.0 | Frozen Milky Smoothy | 284 | 1 | Bob's Diner | 436.56 |
from feature_engine.outliers import Winsorizer
# Instantiate a right-tail gaussian capper: values above mean + 3*std are clipped
capper = Winsorizer(
capping_method = 'gaussian',
tail = 'right',
fold = 3,
variables = ['kcal', 'price', 'item_count']
)
# Fit-Transform
sales_merged_no_outliers = capper.fit_transform(sales_merged_with_outliers)
# Rounding off item_count (capped values may be fractional)
sales_merged_no_outliers['item_count'] = round(sales_merged_no_outliers['item_count'])
# NOTE(review): 'sales' was computed before capping and is NOT recomputed here, so
# rows whose item_count was capped keep their pre-capping revenue — confirm intended.
sales_merged_no_outliers.head()
| date | item_id | price | item_count | item_name | kcal | store_id | store_name | sales | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 | 3 | 29.22 | 2.0 | Sweet Fruity Cake | 931 | 1 | Bob's Diner | 58.44 |
| 1 | 2019-01-01 | 4 | 26.42 | 22.0 | Amazing Steak Dinner with Rolls | 763 | 1 | Bob's Diner | 581.24 |
| 2 | 2019-01-01 | 12 | 4.87 | 7.0 | Fantastic Sweet Cola | 478 | 1 | Bob's Diner | 34.09 |
| 3 | 2019-01-01 | 13 | 4.18 | 12.0 | Sweet Frozen Soft Drink | 490 | 1 | Bob's Diner | 50.16 |
| 4 | 2019-01-01 | 16 | 3.21 | 96.0 | Frozen Milky Smoothy | 284 | 1 | Bob's Diner | 436.56 |
day of the week, quarter of the year, month, year, day of the month and total sales
# Create time series features
def create_ts_features(df):
    """Add calendar columns derived from df['date'] and return df (mutated in place).

    Columns added, in order: year, quarter, month, day, dayofyear, dayofweek.
    """
    date_accessor = df['date'].dt
    for feature in ('year', 'quarter', 'month', 'day', 'dayofyear', 'dayofweek'):
        df[feature] = getattr(date_accessor, feature)
    return df
# Work on a copy of the winsorized data and add the calendar features
sales_merged = sales_merged_no_outliers.copy()
create_ts_features(sales_merged)
# Rearrange columns: date parts first, then store, item, and sales fields
sales_merged = sales_merged[['date', 'year', 'quarter', 'month', 'day', 'dayofyear', 'dayofweek', 'store_id', 'store_name', 'item_id', 'item_name', 'kcal', 'price', 'item_count', 'sales']]
sales_merged.head()
| date | year | quarter | month | day | dayofyear | dayofweek | store_id | store_name | item_id | item_name | kcal | price | item_count | sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 1 | Bob's Diner | 3 | Sweet Fruity Cake | 931 | 29.22 | 2.0 | 58.44 |
| 1 | 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 1 | Bob's Diner | 4 | Amazing Steak Dinner with Rolls | 763 | 26.42 | 22.0 | 581.24 |
| 2 | 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 1 | Bob's Diner | 12 | Fantastic Sweet Cola | 478 | 4.87 | 7.0 | 34.09 |
| 3 | 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 1 | Bob's Diner | 13 | Sweet Frozen Soft Drink | 490 | 4.18 | 12.0 | 50.16 |
| 4 | 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 1 | Bob's Diner | 16 | Frozen Milky Smoothy | 284 | 3.21 | 96.0 | 436.56 |
# Overall sales trend across the full date range
plt.figure(figsize=(12,5))
sns.lineplot(x=sales_merged['date'], y=sales_merged['sales'])
plt.show()
Observations:
# Compare average sales across days of the week for a single year (2021).
year = 2021
# map pandas dayofweek (0 = Monday) to short names
day = {
    0: 'Mon',
    1: 'Tue',
    2: 'Wed',
    3: 'Thu',
    4: 'Fri',
    5: 'Sat',
    6: 'Sun'
}
# create a subset of sales data for a single year 2021.
# FIX: take an explicit .copy() — the original assigned a new column to a slice view,
# which triggers pandas' SettingWithCopyWarning and is not guaranteed to write through.
sales_2021 = sales_merged[sales_merged['year'] == year].copy()
sales_2021['day_name'] = sales_2021['dayofweek'].map(day)
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
sns.barplot(data = sales_2021, x = 'day_name', y = 'sales', order = day_order)
plt.show()
Observations:
# Add month name to the dataset and plot the 2021 monthly sales trend.
month = {
    1: 'Jan',
    2: 'Feb',
    3: 'Mar',
    4: 'Apr',
    5: 'May',
    6: 'Jun',
    7: 'Jul',
    8: 'Aug',
    9: 'Sep',
    10: 'Oct',
    11: 'Nov',
    12: 'Dec'
}
# FIX: derive month_name from sales_2021 itself — the original indexed into the full
# sales_merged frame and only produced correct values via accidental index alignment.
sales_2021['month_name'] = sales_2021['month'].apply(lambda x: month.get(x))
# Plot the graph
sns.lineplot(data = sales_2021, x = 'month_name', y = 'sales')
plt.show()
Observation:
# Average sales per quarter, pooled over all years in the data.
qtr_avg = sales_merged.groupby('quarter', as_index=False)['sales'].mean()
# Plot the graph, labelling quarters as '1Q'..'4Q'
sns.barplot(
    data=qtr_avg,
    x=qtr_avg['quarter'].astype(str) + 'Q',
    y='sales',
)
plt.show()
Observations:
For the last 3 years we see that:
Finding out which restaurant had the most sales
# Total revenue per store, shown as each store's share of overall revenue.
store_sales = sales_merged.groupby(['store_name'], as_index=False)['sales'].sum()
store_sales['percentages'] = store_sales['sales'] / store_sales['sales'].sum() * 100
# Plot the pie
plt.figure(figsize=(6,6))
plt.pie(store_sales['percentages'], labels = store_sales['store_name'], autopct='')
plt.legend(loc='best')
plt.title('Store-wise Sales Contribution')
plt.show()
Observations:
Comparing sales for each restaurant across different years, months, and days
# Let's build aggregated subsets before plotting the graphs.
# NOTE: hard-coded store list; its order fixes the subplot order below.
store_list = ["Bob's Diner", 'Beachfront Bar','Corner Cafe','Fou Cher', 'Surfs Up', 'Sweet Shack']
# calculate yearly sales in thousands (K)
store_sales_year = pd.DataFrame(sales_merged.groupby(['store_name','year'])['sales'].sum()).reset_index()
store_sales_year['sales_K'] = round(store_sales_year['sales']/1000)
# calculate monthly avg
store_sales_month = pd.DataFrame(sales_merged.groupby(['store_name','month'])['sales'].mean()).reset_index()
# calculate daily avg (day of month, 1-31)
store_sales_day = pd.DataFrame(sales_merged.groupby(['store_name','day'])['sales'].mean()).reset_index()
# Let's plot the comparison graph: one row per store — yearly totals (bar),
# monthly averages (line), day-of-month averages (line)
fig, axes = plt.subplots(nrows = len(store_list), ncols = 3, figsize=(18,12))
plt.subplots_adjust(hspace=1.5, wspace=0.5)
for i, store in enumerate(store_list):
    sns.barplot(data = store_sales_year[store_sales_year['store_name'] == store], x = 'year', y = 'sales_K', ax = axes[i,0])
    sns.lineplot(data = store_sales_month[store_sales_month['store_name'] == store], x = 'month', y = 'sales', ax = axes[i,1])
    sns.lineplot(data = store_sales_day[store_sales_day['store_name'] == store], x = 'day', y = 'sales', ax = axes[i,2])
    axes[i,0].set_title(store_list[i], loc='left', fontweight='bold', color='#333333', pad=20)
    axes[i,0].set_ylabel('')
    axes[i,1].set_ylabel('')
    axes[i,2].set_ylabel('')
sns.despine()
plt.show()
Observations:
Fetching the top 5 most popular items overall
# compute how many times an item has been purchased on an avg. in the last 3 years
top5_items = pd.DataFrame(round(sales_merged.groupby(['item_id','item_name'])['item_count'].mean())).reset_index()
top5_items = top5_items.sort_values('item_count', ascending=False).head(5).reset_index().drop('index', axis=1) # fetch top 5 items (comment previously said 10)
plt.figure(figsize=(10,5))
sns.barplot(data = top5_items, y = 'item_name', x = 'item_count')
plt.title('Top 5 most popular items')
sns.despine()
plt.show()
Which are the stores selling the most popular items
# Mean revenue per store for the 5 most popular items found above.
popular = sales_merged[sales_merged['item_id'].isin(top5_items.item_id)]
popular = popular.groupby(['store_name', 'item_name'], as_index=False)['sales'].mean()
popular = popular.sort_values('sales', ascending=False)
plt.figure(figsize=(8,5))
sns.barplot(data=popular, x='store_name', y='sales', hue='item_name')
plt.xlabel('')
plt.title('Store(s) selling top 5 items')
plt.show()
Bob's Diner is the only store selling the 5 most popular items
Finding out the most popular item at each store
# create the aggregate dataset: for each store, the 5 items with the highest
# total units sold (plus their total sales value)
top5_store = pd.DataFrame()
for i, store in enumerate(store_list):
    store_sales = sales_merged[sales_merged['store_name'] == store]
    # NOTE(review): this overwrites the earlier module-level `top5_items` and
    # `store_sales` variables; they are not read again, but the name reuse is fragile.
    top5_items = pd.DataFrame(round(store_sales.groupby(['store_name', 'item_name'])[['item_count', 'sales']].sum())).reset_index()
    y = top5_items.sort_values('item_count', ascending=False).head(5).reset_index().drop('index', axis=1)
    top5_store = pd.concat([top5_store, y])
# plot the top 5 items at each store by number of items sold. Size of the bubble indicates total sales value
plt.figure(figsize=(18,12))
plt.subplots_adjust(hspace=0.7, wspace=1)
for i, store in enumerate(store_list):
    plt.subplot(3,2,i+1)
    sns.scatterplot(data = top5_store[top5_store.store_name == store],
        y = 'item_name',
        x = 'item_count',
        size = 'sales',
        sizes = (20, 200),
        hue = 'sales' ,
        palette = 'deep',
    )
    plt.ylabel('')
    plt.xlabel('')
    plt.title(store, loc='left', fontweight='bold', color='#333333', pad=20)
    plt.legend(loc='lower right')
sns.despine()
plt.show()
# Average daily volume and revenue per store, broken down by day of week.
daily = sales_merged.groupby(['store_name', 'dayofweek'], as_index=False)[['item_count', 'sales']].mean()
daily['day_name'] = daily['dayofweek'].map(day)
# plot the graph: volume on the left, revenue on the right
plt.figure(figsize=(12,6))
plt.subplots_adjust(wspace=0.7)
plt.subplot(1,2,1)
sns.barplot(data=daily, x='day_name', y='item_count', hue='store_name')
plt.legend(loc='upper left')
plt.title('Average volume sold per day')
plt.subplot(1,2,2)
sns.barplot(data=daily, x='day_name', y='sales', hue='store_name', legend=None)
plt.title('Average money made per day')
sns.despine()
plt.show()
Observations:
It is clear that Bob's Diner having the highest sales volume is also making the most money per day.
# creating a subset containing the most expensive item at each store
exp_items_store = pd.DataFrame()
for i, store in enumerate(store_list):
    store_sales = sales_merged[sales_merged['store_name'] == store]
    exp_items = pd.DataFrame(store_sales.groupby(['store_name', 'item_name'])[['price', 'kcal']].max()).reset_index()
    y = exp_items.sort_values('price', ascending=False).head(1).reset_index().drop('index', axis=1)
    exp_items_store = pd.concat([exp_items_store, y])
exp_items_store = exp_items_store.sort_values('price', ascending = False)
# plotting a graph that shows the most expensive items in descending order of their price.
# the graph also depicts calories in these dishes.
# NOTE(review): the y-axis is 'kcal', not 'price' — bars show the calories of each
# store's priciest item; confirm this is the intended visual.
plt.figure(figsize=(10,5))
sns.barplot(data = exp_items_store,
    x = 'store_name',
    y = 'kcal',
    hue = 'item_name'
)
plt.xlabel('')
plt.show()
Observations:
# Keep only the fields needed for forecasting
fc_data = sales_merged[['date', 'year', 'quarter', 'month', 'day', 'dayofyear', 'dayofweek', 'store_id', 'item_count']]
# create daily aggregates for each store (mean units per item per store-day, rounded)
fc_daily_agg = pd.DataFrame(round(fc_data.groupby(['date', 'year', 'quarter', 'month', 'day', 'dayofyear', 'dayofweek', 'store_id'])['item_count'].mean())).reset_index()
fc_daily_agg = fc_daily_agg.set_index('date')
fc_daily_agg.sort_index(inplace=True) # sort dates
# create monthly aggregates for each store (mean of the rounded daily values)
fc1_agg = pd.DataFrame(fc_daily_agg.groupby(['year', 'month', 'store_id'])['item_count'].mean()).reset_index()
# add day column so a first-of-month date can be assembled below
fc1_agg['day'] = 1
# create date from the (year, month, day) columns
fc1_agg['date'] = pd.to_datetime(fc1_agg[['year', 'month','day']])
# store-wise monthly aggregate
fc1_agg = fc1_agg[['date','store_id', 'item_count']]
# create additional time series features
create_ts_features(fc1_agg)
fc1_agg = fc1_agg[['date','year', 'quarter', 'month', 'day', 'dayofyear', 'dayofweek', 'store_id', 'item_count']].set_index('date')
fc1_agg.head()
| year | quarter | month | day | dayofyear | dayofweek | store_id | item_count | |
|---|---|---|---|---|---|---|---|---|
| date | ||||||||
| 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 1 | 13.580645 |
| 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 2 | 0.000000 |
| 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 3 | 0.000000 |
| 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 4 | 0.000000 |
| 2019-01-01 | 2019 | 1 | 1 | 1 | 1 | 1 | 5 | 0.000000 |
# Splitting data into train and test. We will take the last 6 months as test data.
split_date = '2021-07-01'
# comparing a DatetimeIndex against a date string works via pandas' string-to-timestamp coercion
train = fc1_agg.loc[fc1_agg.index < split_date].copy()
test = fc1_agg.loc[fc1_agg.index >= split_date].copy()
print('Train data range: ', min(train.index) ,'-', max(train.index))
print('Test data range : ', min(test.index) ,'-', max(test.index))
Train data range: 2019-01-01 00:00:00 - 2021-06-01 00:00:00 Test data range : 2021-07-01 00:00:00 - 2021-12-01 00:00:00
Visualizing train and test data for each store
# One panel per store: the training series next to the held-out test series.
fig, axs = plt.subplots(3, 2, figsize=(18, 10))
# Flatten the 3x2 grid so panels can be addressed by a single index
axs = axs.flatten()
for idx, store in enumerate(store_list):
    tr = train[train['store_id'] == idx + 1]
    te = test[test['store_id'] == idx + 1]
    axs[idx].plot(tr.index, tr.item_count, label='train')
    axs[idx].plot(te.index, te.item_count, label='test')
    axs[idx].set_title(store, loc='left', fontweight='bold', color='#333333')
axs[0].legend(loc='upper left')
plt.tight_layout()
plt.show()
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Instantiate one estimator per algorithm
# (rfc_model is a RandomForestRegressor despite the classifier-like name)
xgb_model = xgb.XGBRegressor(n_estimators = 1000)
lr_model = LinearRegression()
rfc_model = RandomForestRegressor(n_estimators = 1000, max_depth=5, random_state=123, min_samples_split=5)
# Define algorithms to try: (display name, estimator) pairs
algorithms = [
    ('LinearRegression', lr_model),
    ('RandomForest', rfc_model),
    ('XGBoost', xgb_model),
]
# Define a function to evaluate model performance
def evaluate_model(store, name, model, X_train, X_test, y_train, y_test):
    """Fit `model` on the train split, predict the test split, and return a
    one-row DataFrame with the store, algorithm name, RMSE (3 dp) and the
    wall-clock fit+predict runtime in seconds."""
    t0 = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = round(np.sqrt(mean_squared_error(y_test, predictions)), 3)
    elapsed = time.time() - t0
    row = {
        'Store' : store,
        'Algorithm' : name,
        'Root Mean Squared Error' : rmse,
        'Runtime (in secs)' : elapsed
    }
    return pd.DataFrame([row])
# Evaluate individual model performance for each store
model_comparison = pd.DataFrame()
for i, store_name in enumerate(store_list):
    # per-store train/test frames; store_id is dropped because it is constant within a store
    train_store = train[train['store_id']==i+1].drop('store_id', axis=1)
    test_store = test[test['store_id']==i+1].drop('store_id', axis=1)
    # split the train and test datasets into X and y variables
    X_train, y_train = train_store.drop('item_count', axis=1), train_store['item_count']
    X_test, y_test = test_store.drop('item_count', axis=1), test_store['item_count']
    for name, model in algorithms:
        # the same estimator instances are re-fit for every store (fit starts from scratch)
        metrics = evaluate_model(store_name, name, model, X_train, X_test, y_train, y_test)
        model_comparison = pd.concat([model_comparison, metrics], axis = 0)
model_comparison
| Store | Algorithm | Root Mean Squared Error | Runtime (in secs) | |
|---|---|---|---|---|
| 0 | Bob's Diner | LinearRegression | 3.450 | 0.009614 |
| 0 | Bob's Diner | RandomForest | 1.149 | 1.774468 |
| 0 | Bob's Diner | XGBoost | 1.821 | 2.799378 |
| 0 | Beachfront Bar | LinearRegression | 0.039 | 0.042825 |
| 0 | Beachfront Bar | RandomForest | 0.030 | 1.610173 |
| 0 | Beachfront Bar | XGBoost | 0.023 | 0.137608 |
| 0 | Corner Cafe | LinearRegression | 0.019 | 0.004484 |
| 0 | Corner Cafe | RandomForest | 0.022 | 1.123541 |
| 0 | Corner Cafe | XGBoost | 0.028 | 0.129682 |
| 0 | Fou Cher | LinearRegression | 0.000 | 0.006117 |
| 0 | Fou Cher | RandomForest | 0.000 | 1.128829 |
| 0 | Fou Cher | XGBoost | 0.000 | 0.133886 |
| 0 | Surfs Up | LinearRegression | 0.000 | 0.004126 |
| 0 | Surfs Up | RandomForest | 0.000 | 1.132962 |
| 0 | Surfs Up | XGBoost | 0.000 | 0.138709 |
| 0 | Sweet Shack | LinearRegression | 0.007 | 0.004008 |
| 0 | Sweet Shack | RandomForest | 0.004 | 1.106282 |
| 0 | Sweet Shack | XGBoost | 0.002 | 0.133504 |
# For Store 1: the monthly item_count series used for the classical TS analysis below
data = pd.DataFrame(fc1_agg[fc1_agg.store_id == 1]['item_count'], columns=['item_count'])
plt.figure(figsize=(15,4))
plt.plot(data)
plt.show()
# Check for stationarity of time series using Augmented Dickey Fuller test (ADF)
from statsmodels.tsa.stattools import adfuller
def test_stationarity(df):
    """Run the ADF test on a series and print its statistics.

    The ADF null hypothesis is that the series is NOT stationary; a small
    p-value (or a test statistic below the critical values) rejects it.
    """
    stat, pvalue, n_lags, n_obs, critical_values, *_ = adfuller(df)
    print('Results of Dickey Fuller test:')
    summary = pd.Series(
        [stat, pvalue, n_lags, n_obs],
        index=['Test statistic', 'p-value', '#Lags used', '#Observations used'],
    )
    for level, threshold in critical_values.items():
        summary['Critical value (%s)' % level] = threshold
    print(summary)
test_stationarity(data)
Results of Dickey Fuller test: Test statistic -0.232962 p-value 0.934477 #Lags used 8.000000 #Observations used 27.000000 Critical value (1%) -3.699608 Critical value (5%) -2.976430 Critical value (10%) -2.627601 dtype: float64
Observations:
Null hypothesis: The time series data is not stationary
Alternate hypothesis: The time series data is stationary
1) Based on p-value
2) Based on test statistic and critical value
From the above test values, we infer that our dataset is not stationary.
Since our time series is not stationary and has seasonality, we will use Seasonal ARIMA (SARIMA)
# prepare data for log transformation: (date, store_id) multi-index with one value column.
# NOTE: this rebinds `data` (previously the Store-1-only frame) to the all-stores frame.
data = fc1_agg.reset_index()
data = data[['date','store_id','item_count']].set_index(['date','store_id'])
data.head()
| item_count | ||
|---|---|---|
| date | store_id | |
| 2019-01-01 | 1 | 13.580645 |
| 2 | 0.000000 | |
| 3 | 0.000000 | |
| 4 | 0.000000 | |
| 5 | 0.000000 |
# applying log transform on entire data
log_data = np.log(data).reset_index()
log_data.set_index('date', inplace=True)
# log(0) yields -inf for zero-sale months; mapped to 0 (i.e. same as log(1)) —
# a deliberate simplification for the modelling below
log_data['item_count'].replace(-np.inf, 0, inplace=True) # replace - infinity values with zero
log_data.head()
| store_id | item_count | |
|---|---|---|
| date | ||
| 2019-01-01 | 1 | 2.608646 |
| 2019-01-01 | 2 | 0.000000 |
| 2019-01-01 | 3 | 0.000000 |
| 2019-01-01 | 4 | 0.000000 |
| 2019-01-01 | 5 | 0.000000 |
# Splitting data into train and test. We will take the last 6 months as test data.
# NOTE: this overwrites the earlier feature-based train/test frames with log-scale ones.
split_date = '2021-07-01'
train = log_data.loc[log_data.index < split_date].copy()
test = log_data.loc[log_data.index >= split_date].copy()
train.head()
| store_id | item_count | |
|---|---|---|
| date | ||
| 2019-01-01 | 1 | 2.608646 |
| 2019-01-01 | 2 | 0.000000 |
| 2019-01-01 | 3 | 0.000000 |
| 2019-01-01 | 4 | 0.000000 |
| 2019-01-01 | 5 | 0.000000 |
# Plotting original (left) and log-transformed train/test (right) data for store 1
orig_store1 = fc1_agg[fc1_agg.store_id==1]
train_store1 = train[train.store_id==1]
test_store1 = test[test.store_id==1]
plt.figure(figsize=(18,4))
plt.subplot(1,2,1)
plt.plot(orig_store1.index, orig_store1.item_count)
plt.title('Store 1: Original data')
plt.subplot(1,2,2)
plt.plot(train_store1.index, train_store1.item_count, label='train')
plt.plot(test_store1.index, test_store1.item_count, label='test')
plt.title('Store 1: Log transformed data')
plt.legend()
plt.show()
# Store 1 log series and its 2-month rolling average
log_store1 = log_data[log_data.store_id==1]['item_count']
moving_avg = log_store1.rolling(window=2).mean()
plt.figure(figsize=(8,4))
plt.plot(log_store1)
plt.plot(moving_avg, color='orange')
plt.show()
# calculate difference between moving avg and actual item count (detrending step)
ts_log_mv_diff = log_store1 - moving_avg
# remove nulls (the first point has no 2-period average)
ts_log_mv_diff.dropna(inplace=True)
plt.figure(figsize=(8,4))
plt.plot(ts_log_mv_diff)
plt.show()
Re-checking for stationarity
test_stationarity(ts_log_mv_diff)  # re-run ADF on the moving-average-differenced series
Results of Dickey Fuller test: Test statistic -3.970580 p-value 0.001573 #Lags used 9.000000 #Observations used 25.000000 Critical value (1%) -3.723863 Critical value (5%) -2.986489 Critical value (10%) -2.632800 dtype: float64
Observations:
from statsmodels.tsa.stattools import acf, pacf
# ACF / PACF of the differenced series, with 95% confidence bounds.
lag_acf = acf(ts_log_mv_diff, nlags = 10)
lag_pacf = pacf(ts_log_mv_diff, nlags = 10, method='ols')

def _corr_panel(values, title, ylabel):
    # Plot correlation values with a zero line and +/-1.96/sqrt(N) confidence bounds.
    bound = 1.96/ np.sqrt(len(ts_log_mv_diff))
    plt.plot(values)
    plt.axhline(y = 0, linestyle='--', color='green')
    plt.axhline(y = bound, linestyle='--', color='green')
    plt.axhline(y = -bound, linestyle='--', color='green')
    plt.title(title)
    plt.xlabel('No. of lags')
    plt.ylabel(ylabel)

plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
_corr_panel(lag_acf, 'Autocorrelation function', 'ACF values')
plt.subplot(1,2,2)
_corr_panel(lag_pacf, 'Partial Autocorrelation function', 'PACF values')
plt.show()
Observations:
Using the acf and pacf ranges, we can now build the SARIMA model for each store and evaluate it on test data
# Forecasting function
def forecast_sarima(model, periods = 12):
    """Forecast `periods` steps ahead with a fitted (pmdarima-style) model.

    Returns (point forecasts, lower confidence band, upper confidence band);
    the bands are Series aligned to the forecast index.
    """
    point_fc, conf_int = model.predict(n_periods = periods, return_conf_int = True)
    fc_index = point_fc.index
    lower = pd.Series(conf_int[:,0], index = fc_index)
    upper = pd.Series(conf_int[:,1], index = fc_index)
    return point_fc, lower, upper
# Model building and evaluation for each store: run auto_arima's stepwise search,
# forecast the 6-month test window, record RMSE/runtime, and plot forecasts with CI bands.
import pmdarima as pm
# Creating a figure
fig, axs = plt.subplots(3,2, figsize=(18, 10))
# Flatten the axes array for easy iteration
axs = axs.flatten()
# Adding subplots in a loop
for i, store in enumerate(store_list):
    start_time = time.time()
    train_plt = train[train['store_id'] == i+1]['item_count']
    test_plt = test[test['store_id'] == i+1]['item_count']
    SARIMA_model = pm.auto_arima(train_plt,
        start_p = 1,
        start_P = 0,
        start_q = 3,
        max_p = 3,
        max_q = 5,
        test = 'adf',
        m = 12, # freq of seasonality: once every 12 months
        seasonal = True,
        d = None, # let model find differencing param
        D = 1, # order of seasonal differencing
        trace = False, # logs
        error_action = 'ignore', # silence errors
        suppress_warnings = True, # FIX: was misspelled 'supress_warnings' and therefore had no effect
        stepwise = True,
    )
    # forecast the 6 held-out test months
    test_pred, lower_band_plt, upper_band_plt = forecast_sarima(SARIMA_model, 6)
    runtime = time.time() - start_time
    # Compute metrics
    rmse = round(np.sqrt(mean_squared_error(test_plt, test_pred)),3)
    metrics_df = pd.DataFrame([{
        'Store': store,
        'Algorithm': 'SARIMA',
        'Root Mean Squared Error': rmse,
        'Runtime (in secs)': runtime
    }])
    model_comparison = pd.concat([model_comparison, metrics_df], axis=0)
    # train, test, forecast, and shaded confidence band for this store's panel
    axs[i].plot(train_plt, label='train')
    axs[i].plot(test_plt, label='test')
    axs[i].plot(test_pred, linestyle='--', color='darkgreen', label = 'Forecast')
    axs[i].fill_between(lower_band_plt.index, lower_band_plt, upper_band_plt, color='k', alpha=0.15)
    axs[i].set_title(store, loc='left', fontweight='bold', color='#333333')
axs[0].legend(loc='upper left')
# Adjust layout
plt.tight_layout()
# Show the plot
plt.show()
model_comparison=model_comparison.sort_values('Store')
model_comparison
| Store | Algorithm | Root Mean Squared Error | Runtime (in secs) | |
|---|---|---|---|---|
| 0 | Beachfront Bar | LinearRegression | 0.039 | 0.042825 |
| 0 | Beachfront Bar | RandomForest | 0.030 | 1.610173 |
| 0 | Beachfront Bar | XGBoost | 0.023 | 0.137608 |
| 0 | Beachfront Bar | SARIMA | 2.024 | 4.231126 |
| 0 | Bob's Diner | LinearRegression | 3.450 | 0.009614 |
| 0 | Bob's Diner | RandomForest | 1.149 | 1.774468 |
| 0 | Bob's Diner | XGBoost | 1.821 | 2.799378 |
| 0 | Bob's Diner | SARIMA | 0.133 | 13.243634 |
| 0 | Corner Cafe | LinearRegression | 0.019 | 0.004484 |
| 0 | Corner Cafe | RandomForest | 0.022 | 1.123541 |
| 0 | Corner Cafe | XGBoost | 0.028 | 0.129682 |
| 0 | Corner Cafe | SARIMA | 2.065 | 28.520054 |
| 0 | Fou Cher | SARIMA | 0.000 | 0.040050 |
| 0 | Fou Cher | XGBoost | 0.000 | 0.133886 |
| 0 | Fou Cher | RandomForest | 0.000 | 1.128829 |
| 0 | Fou Cher | LinearRegression | 0.000 | 0.006117 |
| 0 | Surfs Up | XGBoost | 0.000 | 0.138709 |
| 0 | Surfs Up | LinearRegression | 0.000 | 0.004126 |
| 0 | Surfs Up | SARIMA | 0.000 | 0.036910 |
| 0 | Surfs Up | RandomForest | 0.000 | 1.132962 |
| 0 | Sweet Shack | LinearRegression | 0.007 | 0.004008 |
| 0 | Sweet Shack | RandomForest | 0.004 | 1.106282 |
| 0 | Sweet Shack | XGBoost | 0.002 | 0.133504 |
| 0 | Sweet Shack | SARIMA | 1.301 | 6.245150 |
Model is a good fit based on the following conditions:
1) Standardized residual
2) Histogram plus KDE estimate
3) Normal Q-Q
4) Correlogram (ACF plot)
# Residual diagnostics for the last fitted SARIMA model: standardized
# residuals, histogram + KDE, normal Q-Q plot, and the correlogram (ACF).
SARIMA_model.plot_diagnostics(figsize = (15,12))
plt.show()
Observations:
1) Standardized residual
2) Histogram plus KDE estimate
3) Normal Q-Q
4) Correlogram (ACF plot)
# Prepare the dataset for Prophet: keep only date, store and sales
fc1_data = sales_merged[['date', 'store_id', 'sales']]
# Store-wise daily mean sales (rounded), renamed to Prophet's ds/y convention
daily_means = fc1_data.groupby(['store_id', 'date'])['sales'].mean().round()
fc_agg = daily_means.reset_index().rename(columns={'date': 'ds', 'sales': 'y'})
# Sort by store then date so each store's series is chronological
fc_agg = fc_agg.sort_values(['store_id', 'ds']).reset_index(drop=True)
fc_agg.head()
| store_id | ds | y | |
|---|---|---|---|
| 0 | 1 | 2019-01-01 | 165.0 |
| 1 | 1 | 2019-01-02 | 123.0 |
| 2 | 1 | 2019-01-03 | 163.0 |
| 3 | 1 | 2019-01-04 | 206.0 |
| 4 | 1 | 2019-01-05 | 202.0 |
# Splitting data into train and test. We will take the last 6 months as test data
split_date = '2021-07-01'
train = fc_agg[fc_agg.ds < split_date]
test = fc_agg[fc_agg.ds >= split_date]
# Sanity-check the training rows
train.head()
| store_id | ds | y | |
|---|---|---|---|
| 0 | 1 | 2019-01-01 | 165.0 |
| 1 | 1 | 2019-01-02 | 123.0 |
| 2 | 1 | 2019-01-03 | 163.0 |
| 3 | 1 | 2019-01-04 | 206.0 |
| 4 | 1 | 2019-01-05 | 202.0 |
test.tail()
| store_id | ds | y | |
|---|---|---|---|
| 6571 | 6 | 2021-12-27 | 1.0 |
| 6572 | 6 | 2021-12-28 | 3.0 |
| 6573 | 6 | 2021-12-29 | 0.0 |
| 6574 | 6 | 2021-12-30 | 0.0 |
| 6575 | 6 | 2021-12-31 | 2.0 |
# Train and predict using Prophet, one model per store
metrics = []
pred_df = pd.DataFrame()
for i, store in enumerate(store_list):
    store_train = train[train.store_id == i + 1][['ds', 'y']]
    start_time = time.time()
    # Instantiate the Prophet class
    m = Prophet()
    # Fit the model
    m.fit(store_train)
    # Build the future dataframe AFTER fitting. BUGFIX: the original called
    # make_future_dataframe before `m = Prophet()`, which raises a NameError
    # on the first store and reuses the previous store's model afterwards.
    test_fc = m.make_future_dataframe(periods=184)  # 6 months of test data
    # Predict over history + future horizon
    test_pred = m.predict(test_fc)
    test_pred['store'] = store
    pred_df = pd.concat([pred_df, test_pred], axis=0)
    runtime = time.time() - start_time
    # Evaluate only on the hold-out window (dates >= split_date)
    actual = test[test['store_id'] == i + 1]['y']
    predicted = test_pred[test_pred.ds >= split_date]['yhat']
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    metrics.append({'Store': store,
                    'Algorithm': 'Prophet',
                    'Root Mean Squared Error': rmse,
                    'Runtime (in secs)': runtime
                    })
    # Plot forecast vs. history for this store
    fig1 = m.plot(test_pred)
    plt.title(store)
metrics_df = pd.DataFrame(metrics)
model_comparison = pd.concat([model_comparison, metrics_df], axis=0).sort_values('Store')
DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/otrj1b4_.json DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/7jz0n6o_.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=90863', 'data', 'file=/tmp/tmpxw5m34nc/otrj1b4_.json', 'init=/tmp/tmpxw5m34nc/7jz0n6o_.json', 'output', 'file=/tmp/tmpxw5m34nc/prophet_modelmbnginsv/prophet_model-20240628052554.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000'] 05:25:54 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 05:25:54 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/_5puq79_.json DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/xlwcebi1.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=99479', 'data', 'file=/tmp/tmpxw5m34nc/_5puq79_.json', 'init=/tmp/tmpxw5m34nc/xlwcebi1.json', 'output', 'file=/tmp/tmpxw5m34nc/prophet_modelmfxrp568/prophet_model-20240628052555.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000'] 05:25:55 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 05:25:55 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/uefmausy.json DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/xtka2zm6.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7975', 'data', 'file=/tmp/tmpxw5m34nc/uefmausy.json', 'init=/tmp/tmpxw5m34nc/xtka2zm6.json', 'output', 
'file=/tmp/tmpxw5m34nc/prophet_modelyzfa6gxp/prophet_model-20240628052556.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000'] 05:25:56 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 05:25:56 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/jh93set8.json DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/olawls6m.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=37604', 'data', 'file=/tmp/tmpxw5m34nc/jh93set8.json', 'init=/tmp/tmpxw5m34nc/olawls6m.json', 'output', 'file=/tmp/tmpxw5m34nc/prophet_modeltx7ipu6o/prophet_model-20240628052556.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000'] 05:25:56 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 05:25:56 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/r6ppdvmw.json DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/apr1ka8m.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=53964', 'data', 'file=/tmp/tmpxw5m34nc/r6ppdvmw.json', 'init=/tmp/tmpxw5m34nc/apr1ka8m.json', 'output', 'file=/tmp/tmpxw5m34nc/prophet_model7t4xh6wk/prophet_model-20240628052557.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000'] 05:25:57 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 05:25:57 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/6i_rq7_e.json DEBUG:cmdstanpy:input tempfile: /tmp/tmpxw5m34nc/4g3yy14u.json DEBUG:cmdstanpy:idx 0 
DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47918', 'data', 'file=/tmp/tmpxw5m34nc/6i_rq7_e.json', 'init=/tmp/tmpxw5m34nc/4g3yy14u.json', 'output', 'file=/tmp/tmpxw5m34nc/prophet_model6iz5xg6c/prophet_model-20240628052558.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000'] 05:25:58 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 05:25:59 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
# Plot store-wise trends (Prophet trend/seasonality components)
for i, store in enumerate(store_list):
    print(store)
    pred = pred_df[pred_df.store == store].drop('store', axis=1)
    # NOTE(review): `m` here is whichever model object the training loop left
    # behind (the last store fitted), not this store's model. The component
    # values come from `pred`, but consider keeping each store's fitted
    # Prophet instance (e.g. in a dict) and plotting with the matching one.
    fig2 = m.plot_components(pred)
Bob's Diner
Beachfront Bar
Corner Cafe
Fou Cher
Surfs Up
Sweet Shack
model_comparison
| Store | Algorithm | Root Mean Squared Error | Runtime (in secs) | |
|---|---|---|---|---|
| 0 | Beachfront Bar | LinearRegression | 0.039000 | 0.042825 |
| 0 | Beachfront Bar | RandomForest | 0.030000 | 1.610173 |
| 0 | Beachfront Bar | XGBoost | 0.023000 | 0.137608 |
| 0 | Beachfront Bar | SARIMA | 2.024000 | 4.231126 |
| 1 | Beachfront Bar | Prophet | 0.508142 | 0.732511 |
| 0 | Bob's Diner | LinearRegression | 3.450000 | 0.009614 |
| 0 | Bob's Diner | RandomForest | 1.149000 | 1.774468 |
| 0 | Bob's Diner | XGBoost | 1.821000 | 2.799378 |
| 0 | Bob's Diner | SARIMA | 0.133000 | 13.243634 |
| 0 | Bob's Diner | Prophet | 34.248791 | 0.746303 |
| 0 | Corner Cafe | LinearRegression | 0.019000 | 0.004484 |
| 0 | Corner Cafe | RandomForest | 0.022000 | 1.123541 |
| 0 | Corner Cafe | XGBoost | 0.028000 | 0.129682 |
| 0 | Corner Cafe | SARIMA | 2.065000 | 28.520054 |
| 2 | Corner Cafe | Prophet | 0.447118 | 0.596837 |
| 3 | Fou Cher | Prophet | 1.069466 | 0.616112 |
| 0 | Fou Cher | RandomForest | 0.000000 | 1.128829 |
| 0 | Fou Cher | LinearRegression | 0.000000 | 0.006117 |
| 0 | Fou Cher | XGBoost | 0.000000 | 0.133886 |
| 0 | Fou Cher | SARIMA | 0.000000 | 0.040050 |
| 0 | Surfs Up | LinearRegression | 0.000000 | 0.004126 |
| 0 | Surfs Up | SARIMA | 0.000000 | 0.036910 |
| 0 | Surfs Up | RandomForest | 0.000000 | 1.132962 |
| 4 | Surfs Up | Prophet | 0.861536 | 0.705635 |
| 0 | Surfs Up | XGBoost | 0.000000 | 0.138709 |
| 0 | Sweet Shack | LinearRegression | 0.007000 | 0.004008 |
| 0 | Sweet Shack | RandomForest | 0.004000 | 1.106282 |
| 0 | Sweet Shack | XGBoost | 0.002000 | 0.133504 |
| 0 | Sweet Shack | SARIMA | 1.301000 | 6.245150 |
| 5 | Sweet Shack | Prophet | 1.406550 | 1.042681 |
Observations:
From the RMSE scores, we see that Random Forest performs consistently well across all the stores. Hence we will use the Random Forest Regressor algorithm to make a forecast for the next year.
Create the dataset
# Creating the dataset to make a forecast for the next year 2022
dates_2022 = pd.date_range('2022-01-01', '2022-12-31')
forecast_2022 = pd.DataFrame(dates_2022, columns=['date'])
# aggregate by month (min date) so there is exactly 1 row per month
forecast_2022 = forecast_2022.resample('M', on='date').min().reset_index()
# derive the same time-series features the models were trained on
forecast_2022 = create_ts_features(forecast_2022)
forecast_2022_ind = forecast_2022.set_index('date')
forecast_2022_ind.head()
| year | quarter | month | day | dayofyear | dayofweek | |
|---|---|---|---|---|---|---|
| date | ||||||
| 2022-01-31 | 2022 | 1 | 1 | 31 | 31 | 0 |
| 2022-02-28 | 2022 | 1 | 2 | 28 | 59 | 0 |
| 2022-03-31 | 2022 | 1 | 3 | 31 | 90 | 3 |
| 2022-04-30 | 2022 | 2 | 4 | 30 | 120 | 5 |
| 2022-05-31 | 2022 | 2 | 5 | 31 | 151 | 1 |
Build the model
# make predictions for next year
# NOTE(review): despite the `rfc_` prefix, this is a RandomForestRegressor
# (regression), not a classifier. The same estimator object is re-fit once
# per store inside the loop.
rfc_model = RandomForestRegressor(n_estimators = 1000, max_depth=5, random_state=123, min_samples_split=5)
forecast = pd.DataFrame()   # accumulated 2022 forecasts for all stores
df_fc = pd.DataFrame()      # per-store forecast frame (rebuilt each iteration)
df_test = pd.DataFrame()    # accumulated test-window predictions for all stores
for i, store_name in enumerate(store_list):
    # per-store slice of the feature-engineered aggregates (store_id dropped
    # because it is constant within the slice)
    store = fc1_agg[fc1_agg['store_id']==i+1].drop('store_id', axis=1)
    # Splitting data into train and test. We will take the last 6 months as test data
    split_date = '2021-07-01'
    train = store.loc[store.index < split_date].copy()
    test = store.loc[store.index >= split_date].copy()
    # split the train and test datasets into X and y variables
    X_train, y_train = train.drop('item_count', axis=1), train['item_count']
    X_test, y_test = test.drop('item_count', axis=1), test['item_count']
    # Fit this store's model
    rfc_model.fit(X_train, y_train)
    # Predict on the held-out window; keep store_id for later plotting
    test['store_id'] = i+1
    test['y_test_pred'] = rfc_model.predict(X_test)
    df_test = pd.concat([test, df_test], axis=0)
    # Forecast the monthly 2022 rows from calendar features alone
    y_test_fc = pd.DataFrame({
        'Store id' : i+1,
        'Store name': store_name,
        'Item count forecast': rfc_model.predict(forecast_2022_ind)
    })
    # Prepare result: forecast dates/features side by side with predictions
    df_fc = pd.concat([forecast_2022, y_test_fc], axis=1)
    forecast = pd.concat([forecast, df_fc], axis=0)
forecast.date = pd.to_datetime(forecast.date)
forecast = forecast.set_index('date')
Visualize sales volume and forecast for next year
# Creating a figure: one row per store
fig, axs = plt.subplots(6, 1, figsize=(18, 18))
# Flatten the axes array for easy iteration
axs = axs.flatten()
# Draw train history, test actuals, test predictions and the 2022 forecast
for i, store in enumerate(store_list):
    store_id = i + 1
    hist = fc1_agg[(fc1_agg.index < split_date) & (fc1_agg.store_id == store_id)]
    holdout = df_test[df_test['store_id'] == store_id]
    future = forecast[forecast['Store id'] == store_id]
    axs[i].plot(hist.index, hist.item_count, label='train')
    axs[i].plot(holdout.index, holdout.item_count, label='test')
    axs[i].plot(holdout.index, holdout.y_test_pred, label='prediction', alpha=0.5)
    axs[i].plot(future.index, future['Item count forecast'], label='next year forecast')
    axs[i].set_title(store, loc='left', fontweight='bold', color='#333333')
    axs[0].legend(loc='upper left')
# Adjust layout
plt.tight_layout()
# Show the plot
plt.show()
fc1_data = sales_merged[['date', 'store_id', 'sales']]
# create daily aggregates across all stores (rounded mean), sorted by date
daily_mean = fc1_data.groupby(['date'])['sales'].mean().round()
fc_agg = daily_mean.to_frame().sort_index()
fc_agg.head()
# # Uncomment below code to create store-wise daily aggregates
# fc_agg = pd.DataFrame(round(fc1_data.groupby(['date', 'store_id'])['sales'].mean())).reset_index()
# fc_agg = fc_agg.set_index(['store_id','date'])
# fc_agg.sort_index(inplace=True) # sort dates
# fc_agg.head()
| sales | |
|---|---|
| date | |
| 2019-01-01 | 40.0 |
| 2019-01-02 | 31.0 |
| 2019-01-03 | 41.0 |
| 2019-01-04 | 50.0 |
| 2019-01-05 | 49.0 |
# Scaling the data into [-1, 1] for the LSTM
from sklearn.preprocessing import MinMaxScaler
fc1_agg = fc_agg.copy()
# Instantiate the scaler (kept in `scaler` for later inverse_transform)
scaler = MinMaxScaler(feature_range=(-1, 1))
# Fit-Transform and write the scaled values back into the dataframe
fc1_agg['sales'] = scaler.fit_transform(fc1_agg)
fc1_agg.reset_index(inplace=True)
fc1_agg.set_index('date', inplace=True)
fc1_agg.head()
| sales | |
|---|---|
| date | |
| 2019-01-01 | -0.509804 |
| 2019-01-02 | -0.686275 |
| 2019-01-03 | -0.490196 |
| 2019-01-04 | -0.313725 |
| 2019-01-05 | -0.333333 |
# Splitting data into train/validation/test. We will take the last 6 months as test data
split_date1 = '2021-04-01'  # train ends here
split_date2 = '2021-07-01'  # validation ends here
idx = fc1_agg.index
train = fc1_agg.loc[idx < split_date1].copy()
val = fc1_agg.loc[(idx >= split_date1) & (idx < split_date2)].copy()
test = fc1_agg.loc[idx >= split_date2].copy()
print('Train data range: ', min(train.index) ,'-', max(train.index))
print('Validation data range: ', min(val.index) ,'-', max(val.index))
print('Test data range : ', min(test.index) ,'-', max(test.index))
Train data range: 2019-01-01 00:00:00 - 2021-03-31 00:00:00 Validation data range: 2021-04-01 00:00:00 - 2021-06-30 00:00:00 Test data range : 2021-07-01 00:00:00 - 2021-12-31 00:00:00
This involves creating X (features) and y (label). X is a sequence of prior timesteps used to predict the next timestep y. We choose the lag length, e.g. 5.
| X | Y |
|---|---|
| [ [1] [2] [3] [4] [5] ] | [6] |
| [ [2] [3] [4] [5] [6] ] | [7] |
| [ [3] [4] [5] [6] [7] ] | [8] |
# Define a function to create the lag data
def df_to_X_y(df, window_size=5):
    """Turn a univariate series into supervised (X, y) lag windows.

    Parameters
    ----------
    df : pandas Series or single-column DataFrame of observations.
    window_size : number of prior timesteps used as features.

    Returns
    -------
    X : ndarray of shape (n - window_size, window_size, 1)
    y : ndarray of shape (n - window_size,)

    BUGFIX: the values are flattened to 1-D first. Previously a DataFrame
    input produced X of shape (n, window_size, 1, 1), which does not match
    the LSTM's declared input_shape=(window_size, 1).
    """
    # flatten so Series and single-column DataFrames behave identically
    df_as_np = np.asarray(df).reshape(-1)
    X = []
    y = []
    for i in range(len(df_as_np) - window_size):
        # window of prior observations, each wrapped as a 1-feature vector
        row = [[a] for a in df_as_np[i:i + window_size]]
        X.append(row)
        # the observation immediately after the window is the label
        y.append(df_as_np[i + window_size])
    return np.array(X), np.array(y)
# Uncomment below code for store-wise analysis
# train = train[train.store_id == 1]['sales']
# val = val[val.store_id == 1]['sales']
# test = test[test.store_id == 1]['sales']
WINDOW_SIZE = 5
# Build supervised lag datasets for each split and report the shapes
X_train, y_train = df_to_X_y(train, WINDOW_SIZE)
X_val, y_val = df_to_X_y(val, WINDOW_SIZE)
X_test, y_test = df_to_X_y(test, WINDOW_SIZE)
for label, X_arr, y_arr in (('X_train, y_train: ', X_train, y_train),
                            ('X_val, y_val: ', X_val, y_val),
                            ('X_test, y_test: ', X_test, y_test)):
    print(label, X_arr.shape, y_arr.shape)
X_train, y_train: (816, 5, 1, 1) (816, 1) X_val, y_val: (86, 5, 1, 1) (86, 1) X_test, y_test: (179, 5, 1, 1) (179, 1)
# Synthetic data generation
def generate_synthetic_data(data, days=365, random_state=None, start='2021-01-01'):
    """Bootstrap `days` rows from `data` and stamp them with consecutive dates.

    Rows are sampled with replacement, the original 'date' column is dropped,
    and a fresh daily date range starting at `start` is attached.

    Parameters
    ----------
    data : DataFrame containing a 'date' column.
    days : number of synthetic rows (one per day) to generate.
    random_state : optional seed for reproducible sampling (new, defaults to
        None, i.e. the original nondeterministic behavior).
    start : first date of the synthetic range (new, defaults to the original
        hard-coded '2021-01-01').
    """
    synthetic_data = data.sample(n=days, replace=True, random_state=random_state).reset_index(drop=True)
    # drop the sampled rows' original dates; they get new consecutive ones
    synthetic_data = synthetic_data.drop(columns=['date'])
    synthetic_data['date'] = pd.date_range(start=start, periods=days, freq='D')
    return synthetic_data
# Generate one year (365 rows) of bootstrap-sampled synthetic sales
synthetic_data = generate_synthetic_data(sales_merged_no_outliers, 365)
# Create time series features
# NOTE(review): the return value is discarded here, while the earlier call
# site assigns it — this only works if create_ts_features also mutates its
# argument in place; confirm against its definition.
create_ts_features(synthetic_data)
# fix a readable column order for display
synthetic_data = synthetic_data[['date', 'year', 'quarter', 'month', 'day', 'dayofyear', 'dayofweek', 'store_id', 'store_name', 'item_id', 'item_name', 'kcal', 'price', 'item_count', 'sales']]
synthetic_data
| date | year | quarter | month | day | dayofyear | dayofweek | store_id | store_name | item_id | item_name | kcal | price | item_count | sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-01 | 2021 | 1 | 1 | 1 | 1 | 4 | 1 | Bob's Diner | 19 | Strawberry Smoothy | 145 | 2.89 | 96.0 | 381.48 |
| 1 | 2021-01-02 | 2021 | 1 | 1 | 2 | 2 | 5 | 4 | Fou Cher | 34 | Sweet Savory Cake | 721 | 27.47 | 0.0 | 0.00 |
| 2 | 2021-01-03 | 2021 | 1 | 1 | 3 | 3 | 6 | 5 | Corner Cafe | 63 | Frozen Chocolate Cake | 519 | 3.74 | 0.0 | 0.00 |
| 3 | 2021-01-04 | 2021 | 1 | 1 | 4 | 4 | 0 | 4 | Fou Cher | 82 | Martini | 556 | 5.45 | 0.0 | 0.00 |
| 4 | 2021-01-05 | 2021 | 1 | 1 | 5 | 5 | 1 | 5 | Corner Cafe | 50 | Pike Lunch | 653 | 26.37 | 0.0 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 360 | 2021-12-27 | 2021 | 4 | 12 | 27 | 361 | 0 | 4 | Fou Cher | 2 | Breaded Fish with Vegetables Meal | 772 | 15.09 | 0.0 | 0.00 |
| 361 | 2021-12-28 | 2021 | 4 | 12 | 28 | 362 | 1 | 1 | Bob's Diner | 67 | Sweet Lamb Cake | 558 | 7.00 | 10.0 | 70.00 |
| 362 | 2021-12-29 | 2021 | 4 | 12 | 29 | 363 | 2 | 4 | Fou Cher | 34 | Sweet Savory Cake | 721 | 27.47 | 0.0 | 0.00 |
| 363 | 2021-12-30 | 2021 | 4 | 12 | 30 | 364 | 3 | 3 | Sweet Shack | 30 | Fantastic Cake | 525 | 5.08 | 0.0 | 0.00 |
| 364 | 2021-12-31 | 2021 | 4 | 12 | 31 | 365 | 4 | 4 | Fou Cher | 82 | Martini | 556 | 5.45 | 0.0 | 0.00 |
365 rows × 15 columns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanAbsolutePercentageError, MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
# Stacked LSTM: two 50-unit layers feeding a single-output regression head
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(WINDOW_SIZE, 1)),
    LSTM(50),
    Dense(1),
])
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 5, 50) 10400
lstm_1 (LSTM) (None, 50) 20200
dense (Dense) (None, 1) 51
=================================================================
Total params: 30651 (119.73 KB)
Trainable params: 30651 (119.73 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
# Checkpoint callback keeps only the best model (by the callback's default
# monitor, validation loss) on Drive
cp = ModelCheckpoint('/content/drive/MyDrive/SalesForecasting_Datasets/model/', save_best_only=True)
# Compile with Adam and MSE loss
model.compile(optimizer='adam', loss='mean_squared_error')
# Train with validation monitoring so the checkpoint can pick the best epoch
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    callbacks=[cp],
)
Epoch 1/50 26/26 [==============================] - 14s 366ms/step - loss: 0.1554 - val_loss: 0.2784 Epoch 2/50 26/26 [==============================] - 8s 323ms/step - loss: 0.1472 - val_loss: 0.2705 Epoch 3/50 26/26 [==============================] - 9s 342ms/step - loss: 0.1434 - val_loss: 0.2653 Epoch 4/50 26/26 [==============================] - 8s 313ms/step - loss: 0.1373 - val_loss: 0.2509 Epoch 5/50 26/26 [==============================] - 8s 323ms/step - loss: 0.1281 - val_loss: 0.2340 Epoch 6/50 26/26 [==============================] - 8s 305ms/step - loss: 0.1122 - val_loss: 0.2066 Epoch 7/50 26/26 [==============================] - 8s 335ms/step - loss: 0.0927 - val_loss: 0.1662 Epoch 8/50 26/26 [==============================] - 8s 324ms/step - loss: 0.0843 - val_loss: 0.1429 Epoch 9/50 26/26 [==============================] - 8s 315ms/step - loss: 0.0794 - val_loss: 0.1417 Epoch 10/50 26/26 [==============================] - 9s 348ms/step - loss: 0.0765 - val_loss: 0.1312 Epoch 11/50 26/26 [==============================] - 8s 320ms/step - loss: 0.0702 - val_loss: 0.1198 Epoch 12/50 26/26 [==============================] - 9s 358ms/step - loss: 0.0658 - val_loss: 0.1191 Epoch 13/50 26/26 [==============================] - 7s 271ms/step - loss: 0.0632 - val_loss: 0.1118 Epoch 14/50 26/26 [==============================] - 8s 331ms/step - loss: 0.0567 - val_loss: 0.0906 Epoch 15/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0548 - val_loss: 0.0979 Epoch 16/50 26/26 [==============================] - 8s 327ms/step - loss: 0.0514 - val_loss: 0.0899 Epoch 17/50 26/26 [==============================] - 0s 13ms/step - loss: 0.0488 - val_loss: 0.0903 Epoch 18/50 26/26 [==============================] - 10s 388ms/step - loss: 0.0470 - val_loss: 0.0851 Epoch 19/50 26/26 [==============================] - 8s 310ms/step - loss: 0.0460 - val_loss: 0.0828 Epoch 20/50 26/26 [==============================] - 8s 330ms/step - loss: 0.0449 - 
val_loss: 0.0736 Epoch 21/50 26/26 [==============================] - 8s 334ms/step - loss: 0.0453 - val_loss: 0.0701 Epoch 22/50 26/26 [==============================] - 0s 13ms/step - loss: 0.0435 - val_loss: 0.0718 Epoch 23/50 26/26 [==============================] - 7s 297ms/step - loss: 0.0442 - val_loss: 0.0688 Epoch 24/50 26/26 [==============================] - 9s 364ms/step - loss: 0.0432 - val_loss: 0.0616 Epoch 25/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0442 - val_loss: 0.0718 Epoch 26/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0432 - val_loss: 0.0619 Epoch 27/50 26/26 [==============================] - 7s 273ms/step - loss: 0.0412 - val_loss: 0.0566 Epoch 28/50 26/26 [==============================] - 0s 10ms/step - loss: 0.0411 - val_loss: 0.0608 Epoch 29/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0412 - val_loss: 0.0701 Epoch 30/50 26/26 [==============================] - 9s 355ms/step - loss: 0.0401 - val_loss: 0.0544 Epoch 31/50 26/26 [==============================] - 1s 21ms/step - loss: 0.0415 - val_loss: 0.0618 Epoch 32/50 26/26 [==============================] - 1s 20ms/step - loss: 0.0392 - val_loss: 0.0661 Epoch 33/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0396 - val_loss: 0.0608 Epoch 34/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0400 - val_loss: 0.0610 Epoch 35/50 26/26 [==============================] - 0s 10ms/step - loss: 0.0384 - val_loss: 0.0596 Epoch 36/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0385 - val_loss: 0.0623 Epoch 37/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0390 - val_loss: 0.0614 Epoch 38/50 26/26 [==============================] - 0s 10ms/step - loss: 0.0381 - val_loss: 0.0560 Epoch 39/50 26/26 [==============================] - 8s 308ms/step - loss: 0.0389 - val_loss: 0.0492 Epoch 40/50 26/26 [==============================] - 0s 14ms/step - loss: 0.0377 - 
val_loss: 0.0536 Epoch 41/50 26/26 [==============================] - 0s 15ms/step - loss: 0.0378 - val_loss: 0.0583 Epoch 42/50 26/26 [==============================] - 0s 14ms/step - loss: 0.0373 - val_loss: 0.0588 Epoch 43/50 26/26 [==============================] - 0s 15ms/step - loss: 0.0371 - val_loss: 0.0501 Epoch 44/50 26/26 [==============================] - 8s 304ms/step - loss: 0.0369 - val_loss: 0.0484 Epoch 45/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0365 - val_loss: 0.0526 Epoch 46/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0372 - val_loss: 0.0511 Epoch 47/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0377 - val_loss: 0.0581 Epoch 48/50 26/26 [==============================] - 0s 11ms/step - loss: 0.0355 - val_loss: 0.0505 Epoch 49/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0375 - val_loss: 0.0612 Epoch 50/50 26/26 [==============================] - 0s 9ms/step - loss: 0.0359 - val_loss: 0.0553
<keras.src.callbacks.History at 0x7f7727671480>
# loading the best saved model into memory
from tensorflow.keras.models import load_model
model = load_model('/content/drive/MyDrive/SalesForecasting_Datasets/model/')
# Generate predictions for every split
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_val, X_test)
)
print(y_train_pred.shape)
26/26 [==============================] - 1s 5ms/step 3/3 [==============================] - 0s 6ms/step 6/6 [==============================] - 0s 5ms/step (816, 1)
# create dataframe with actuals and predictions, inverse-scaled back to
# original sales units
splits = [
    ('Train', y_train, y_train_pred),
    ('Validation', y_val, y_val_pred),
    ('Test', y_test, y_test_pred),
]
frames = []
for split_name, actual, predicted in splits:
    act = scaler.inverse_transform(actual.reshape(-1, 1)).flatten()
    pred = scaler.inverse_transform(predicted).flatten()
    frames.append(pd.DataFrame(data={'Dataset': split_name, 'Actuals': act, 'Predicted': pred}))
train_results = pd.concat(frames, axis=0)
train_results
| Dataset | Actuals | Predicted | |
|---|---|---|---|
| 0 | Train | 16.0 | 51.915676 |
| 1 | Train | 19.0 | 24.979496 |
| 2 | Train | 22.0 | 28.201834 |
| 3 | Train | 29.0 | 35.202896 |
| 4 | Train | 40.0 | 47.561543 |
| ... | ... | ... | ... |
| 174 | Test | 19.0 | 40.866379 |
| 175 | Test | 34.0 | 37.354183 |
| 176 | Test | 37.0 | 36.087856 |
| 177 | Test | 53.0 | 52.760036 |
| 178 | Test | 83.0 | 58.212421 |
1081 rows × 3 columns
Visualizing the predictions
# Actual vs. predicted for each split, one panel per split
plt.figure(figsize=(12, 8))
plt.subplots_adjust(hspace=0.7)
for pos, split_name in enumerate(('Train', 'Validation', 'Test'), start=1):
    plt.subplot(3, 1, pos)
    data = train_results[train_results.Dataset == split_name]
    plt.plot(data['Actuals'], label='Actual')
    plt.plot(data['Predicted'], label='Predicted', alpha=0.7)
    plt.title(split_name, loc='left', fontweight='bold', color='#333333', pad=20)
    if split_name == 'Train':
        # legend only on the first panel, as in the original layout
        plt.legend()
plt.tight_layout()
sns.despine()
plt.show()
from sklearn.metrics import mean_absolute_percentage_error
# `data` is the Test subset left over from the plotting cell above.
# Fix: sklearn's signature is (y_true, y_pred) — actuals must come first.
# The original call passed them swapped, which skews the reported MAPE
# because the metric normalizes each error by its first argument.
mape = mean_absolute_percentage_error(data['Actuals'], data['Predicted'])
print(f'Mean Absolute Percentage Error (MAPE): {mape*100:.2f}%')
Mean Absolute Percentage Error (MAPE): 18.24%
Observations:
The test predictions indicate that the model performs better when it is trained on a longer time series. We will now develop another model using the entire series for training, and use it to forecast the next three months.
Prepare the data
# Rebuild the full day-level series (real + synthetic sales) for the
# final model that trains on the entire history.
full_data = pd.concat([sales_merged, synthetic_data])
fc1_data = full_data[['date', 'store_id', 'sales']]
# Mean sales per day, rounded to whole units, indexed and sorted by date.
fc_agg = (
    fc1_data.groupby(['date'])['sales']
    .mean()
    .round()
    .to_frame()
    .reset_index()
    .set_index('date')
    .sort_index()
)
# Refit the scaler on the complete series and overwrite in place.
fc_agg['sales'] = scaler.fit_transform(fc_agg)
# Window the series into supervised (X, y) pairs.
WINDOW_SIZE = 5
X_full, y_full = df_to_X_y(fc_agg, WINDOW_SIZE)
X_full.shape, y_full.shape
Build the model
# Same topology as the earlier model — two stacked LSTMs feeding a
# single-unit regression head — retrained on the full series.
model_full = Sequential([
    LSTM(50, return_sequences=True, input_shape=(WINDOW_SIZE, 1)),
    LSTM(50),
    Dense(1),
])
# No validation split here, so ModelCheckpoint(save_best_only=True) has
# no val_loss to monitor — checkpointing is intentionally skipped.
model_full.compile(optimizer='adam', loss='mean_squared_error')
model_full.fit(X_full, y_full, epochs=50)
Epoch 1/50 35/35 [==============================] - 6s 8ms/step - loss: 0.1795 Epoch 2/50 35/35 [==============================] - 0s 8ms/step - loss: 0.1706 Epoch 3/50 35/35 [==============================] - 0s 9ms/step - loss: 0.1634 Epoch 4/50 35/35 [==============================] - 0s 7ms/step - loss: 0.1434 Epoch 5/50 35/35 [==============================] - 0s 8ms/step - loss: 0.1117 Epoch 6/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0932 Epoch 7/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0828 Epoch 8/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0722 Epoch 9/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0663 Epoch 10/50 35/35 [==============================] - 0s 9ms/step - loss: 0.0621 Epoch 11/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0573 Epoch 12/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0546 Epoch 13/50 35/35 [==============================] - 0s 9ms/step - loss: 0.0530 Epoch 14/50 35/35 [==============================] - 0s 11ms/step - loss: 0.0521 Epoch 15/50 35/35 [==============================] - 0s 11ms/step - loss: 0.0517 Epoch 16/50 35/35 [==============================] - 0s 13ms/step - loss: 0.0547 Epoch 17/50 35/35 [==============================] - 0s 11ms/step - loss: 0.0505 Epoch 18/50 35/35 [==============================] - 0s 13ms/step - loss: 0.0508 Epoch 19/50 35/35 [==============================] - 0s 11ms/step - loss: 0.0494 Epoch 20/50 35/35 [==============================] - 0s 11ms/step - loss: 0.0475 Epoch 21/50 35/35 [==============================] - 0s 12ms/step - loss: 0.0486 Epoch 22/50 35/35 [==============================] - 0s 12ms/step - loss: 0.0483 Epoch 23/50 35/35 [==============================] - 0s 12ms/step - loss: 0.0487 Epoch 24/50 35/35 [==============================] - 0s 9ms/step - loss: 0.0483 Epoch 25/50 35/35 [==============================] - 0s 7ms/step - loss: 
0.0470 Epoch 26/50 35/35 [==============================] - 0s 9ms/step - loss: 0.0459 Epoch 27/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0459 Epoch 28/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0453 Epoch 29/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0451 Epoch 30/50 35/35 [==============================] - 0s 9ms/step - loss: 0.0453 Epoch 31/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0443 Epoch 32/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0446 Epoch 33/50 35/35 [==============================] - 0s 9ms/step - loss: 0.0450 Epoch 34/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0440 Epoch 35/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0430 Epoch 36/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0432 Epoch 37/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0422 Epoch 38/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0417 Epoch 39/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0422 Epoch 40/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0412 Epoch 41/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0411 Epoch 42/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0399 Epoch 43/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0398 Epoch 44/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0392 Epoch 45/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0401 Epoch 46/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0387 Epoch 47/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0389 Epoch 48/50 35/35 [==============================] - 0s 8ms/step - loss: 0.0377 Epoch 49/50 35/35 [==============================] - 0s 7ms/step - loss: 0.0381 Epoch 50/50 35/35 [==============================] - 0s 8ms/step - loss: 
0.0374
<keras.src.callbacks.History at 0x7f7726a1feb0>
Make predictions
# Seed the forecast with the last WINDOW_SIZE scaled observations,
# shaped (batch=1, timesteps, features=1) for the LSTM input.
fc_input = np.reshape(fc_agg[-WINDOW_SIZE:], (1, WINDOW_SIZE, 1))
fc_input
array([[[-0.94059406],
[-0.6039604 ],
[-0.56435644],
[-0.26732673],
[ 0.32673267]]])
# Recursive multi-step forecast: predict one day ahead, slide the input
# window forward by appending that prediction, repeat for 90 days (~3 months).
fc_3mo = []
for _ in range(90):
    # verbose=0 silences the per-call progress bar — the default printed
    # 90 separate "1/1 [=====] ..." lines, one per predict() call.
    pred = model_full.predict(fc_input, verbose=0).flatten()
    fc_3mo.append(pred)
    # Drop the oldest timestep and append the new prediction at the end.
    fc_input = np.append(fc_input[:, 1:, :], [[pred]], axis=1)
# Reshape to a single column and undo the scaling to get sales units.
fc_3mo = np.array(fc_3mo).reshape(-1, 1)
forecast = scaler.inverse_transform(fc_3mo).flatten()
1/1 [==============================] - 1s 1s/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 30ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 20ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 21ms/step 1/1 
[==============================] - 0s 22ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 28ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 25ms/step 1/1 
[==============================] - 0s 27ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 28ms/step 1/1 [==============================] - 0s 23ms/step
Create final dataframe with actuals and predicted values
# Date ranges for each phase — each must match its split's row count.
train_date_list = pd.DataFrame(pd.date_range('2019-01-06','2021-03-31'), columns=['date'])
val_date_list = pd.DataFrame(pd.date_range('2021-04-06','2021-06-30'), columns=['date'])
test_date_list = pd.DataFrame(pd.date_range('2021-07-06','2021-12-31'), columns=['date'])
fc_date_list = pd.DataFrame(pd.date_range('2022-01-01','2022-03-31'), columns=['date'])

def _with_dates(dates, results):
    # Side-by-side concat pairs rows positionally. pd.concat(axis=1)
    # aligns on index labels, so reset both indexes first — the slices
    # of train_results carry per-split 0-based indexes that only happen
    # to line up with the date lists when the lengths match exactly.
    return pd.concat(
        [dates.reset_index(drop=True), results.reset_index(drop=True)],
        axis=1,
    )

train_ds = _with_dates(train_date_list, train_results[train_results.Dataset == 'Train'])
val_ds = _with_dates(val_date_list, train_results[train_results.Dataset == 'Validation'])
test_ds = _with_dates(test_date_list, train_results[train_results.Dataset == 'Test'])
# Forecast rows have no actuals — only predicted values.
fc_ds = _with_dates(
    fc_date_list,
    pd.DataFrame(data={'Dataset': 'Forecast', 'Actuals': None, 'Predicted': forecast}),
)
# Final frame: all four phases stacked row-wise.
ds = pd.concat([train_ds, val_ds, test_ds, fc_ds], axis = 0)
ds.shape
ds.head()
| date | Dataset | Actuals | Predicted | |
|---|---|---|---|---|
| 0 | 2019-01-06 | Train | 16.0 | 51.915676 |
| 1 | 2019-01-07 | Train | 19.0 | 24.979496 |
| 2 | 2019-01-08 | Train | 22.0 | 28.201834 |
| 3 | 2019-01-09 | Train | 29.0 | 35.202896 |
| 4 | 2019-01-10 | Train | 40.0 | 47.561543 |
Visualizing the forecast
# Full-history view: actuals for each split, test predictions overlaid,
# and the 3-month forecast appended at the right edge.
plt.figure(figsize=(20, 6))
subsets = {
    name: ds[ds.Dataset == name]
    for name in ('Train', 'Validation', 'Test', 'Forecast')
}
plt.plot(subsets['Train']['date'], subsets['Train']['Actuals'], label='Train')
plt.plot(subsets['Validation']['date'], subsets['Validation']['Actuals'], label='Validation')
plt.plot(subsets['Test']['date'], subsets['Test']['Actuals'], label='Test act', alpha=0.7)
plt.plot(subsets['Test']['date'], subsets['Test']['Predicted'], label='Test pred', alpha=0.7)
plt.plot(subsets['Forecast']['date'], subsets['Forecast']['Predicted'], label='Forecast')
plt.title('Sales - Actuals and 3 month forecast', loc='center', fontweight='bold', color='#333333', pad=10)
plt.ylabel('Sales')
plt.legend()
sns.despine()
plt.show()